In [1]:
import pandas as pd
import numpy as np
import os
import sys
import simpledbf
%pylab inline
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import linear_model
In [2]:
def runModel(dataset, income, varForModel):
    '''
    Fit a weighted least squares regression of `income` on `varForModel`,
    print the fitted summary, then estimate in-sample (IS) and
    out-of-sample (OS) R^2 by averaging over repeated 70/30 splits.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the `income` column, every column in `varForModel`,
        and the survey expansion-factor column PONDERA.
    income : str
        Name of the dependent-variable column.
    varForModel : list of str
        Names of the explanatory-variable columns.
    '''
    y = dataset[income].values
    X = dataset.loc[:, varForModel].values
    X = sm.add_constant(X)
    w = dataset.PONDERA
    # Weight each observation by the inverse of its expansion factor.
    lm = sm.WLS(y, X, weights=1. / w, missing='drop', hasconst=True).fit()
    print(lm.summary())
    # Map the x1..xk labels in the summary back to the variable names.
    for i in range(1, len(varForModel) + 1):
        print('x%d: %s' % (i, varForModel[i - 1]))
    # Repeated hold-out validation. The seed must vary per iteration:
    # a fixed random_state would produce the identical split n times,
    # making the averaging pointless.
    R_IS = []
    R_OS = []
    n = 500
    for i in range(n):
        # Split the weights together with X and y so every training row
        # keeps its own weight (slicing w[:len(X_train)] would pair the
        # shuffled rows with the wrong weights).
        X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
            X, y, w, test_size=0.3, random_state=i)
        # X already carries the constant column added above, so no extra
        # add_constant call is needed on the splits.
        lm = linear_model.LinearRegression(fit_intercept=True)
        lm.fit(X_train, y_train, sample_weight=1. / w_train)
        y_hat_IS = lm.predict(X_train)
        err_IS = y_hat_IS - y_train
        R_IS.append(1 - (np.var(err_IS) / np.var(y_train)))
        y_hat_OS = lm.predict(X_test)
        err_OS = y_hat_OS - y_test
        R_OS.append(1 - (np.var(err_OS) / np.var(y_test)))
    print("IS R-squared for {} times is {}".format(n, np.mean(R_IS)))
    print("OS R-squared for {} times is {}".format(n, np.mean(R_OS)))
In [3]:
# Load the prepared dataset and keep only agglomerate 32.
data = pd.read_csv('data/dataFinalParaModelo.csv')
data = data.loc[data['AGLO1'] == 32.0]
data.head()
Out[3]:
In [4]:
# Candidate explanatory variables considered for the income model
# (housing characteristics, household composition and head/spouse
# attributes); screened below via a correlation heatmap.
varForModel = [
'HomeType',
'RoomsNumber',
'FloorMaterial',
'RoofMaterial',
'RoofCoat',
'Water',
'Toilet',
'ToiletLocation',
'ToiletType',
'Sewer',
'EmergencyLoc',
'UsableTotalRooms',
'SleepingRooms',
'Kitchen',
'Sink',
'Ownership',
'CookingCombustible',
'BathroomUse',
'Working',
'HouseMembers',
'Memberless10',
'Membermore10',
'TotalFamilyIncome',
'CookingRec',
'WaterRec',
'OwnershipRec',
'Hacinamiento',
'schoolAndJob',
'noJob',
'job',
'headAge',
'spouseAge',
'headFemale',
'spouseFemale',
'headEduc',
'spouseEduc',
'headPrimary',
'spousePrimary',
'headSecondary',
'spouseSecondary',
'headUniversity',
'spouseUniversity',
'headJob',
'spouseJob',
'headMaritalStatus',
'spouseMaritalStatus',
'sumPredicted']
In [5]:
# Households without a spouse have NaN in spouseJob; record the flag
# before those NaNs are imputed below.
data['hasSpouse'] = np.where(data.spouseJob.isnull(), 0, 1)

# Impute missing codes with their "absent / not applicable" category in a
# single pass (equivalent to the former per-column
# np.where(np.isnan(...), value, column) calls).
data = data.fillna(value={
    'spouseJob': 0,
    'FloorMaterial': 5,
    'sumPredicted': 0,
    'Sewer': 5,
    'ToiletType': 4,
    'Water': 4,
    'RoofCoat': 2,
    'RoofMaterial': 0,
    'ToiletLocation': 2,
})

# Zero incomes would make the log transform undefined; map them to 1.
# (Explicit assignment instead of a chained inplace replace, which can
# fail silently on a sliced frame.)
data['TotalFamilyIncome'] = data['TotalFamilyIncome'].replace(0, 1)
data = data[data.TotalFamilyIncomeDecReg != 0]
data['income_log'] = np.log(data.TotalFamilyIncome)
data['income_logPer'] = np.log(data.PerCapInc)
# Overcrowding flag: more than 3 persons per room.
data['haciBool'] = (data.Hacinamiento > 3).astype(int)
In [6]:
import seaborn as sns

sns.set(context="paper", font="monospace", font_scale=1.25)
# Order the features by their correlation with TotalFamilyIncome so the
# heatmap reads from most to least correlated.
order = (data.loc[:, varForModel]
         .corr()['TotalFamilyIncome']
         .sort_values(ascending=False)
         .index)
corrmat = data.loc[:, list(order)].corr()
f, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corrmat, vmax=.8, square=True)
f.tight_layout()
In [7]:
# Head-of-household and spouse attributes, listed in pairs, to check for
# homogamy (spouses resembling household heads on age, education,
# employment and marital status).
varHomogamy = [
    'headAge', 'spouseAge',
    'headFemale', 'spouseFemale',
    'headEduc', 'spouseEduc',
    'headPrimary', 'spousePrimary',
    'headSecondary', 'spouseSecondary',
    'headUniversity', 'spouseUniversity',
    'headJob', 'spouseJob',
    'headMaritalStatus', 'spouseMaritalStatus',
]
sns.set(context="paper", font="monospace", font_scale=2)
corrmat = data.loc[:, varHomogamy].corr()
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corrmat, vmax=.8, square=True)
f.tight_layout()
Notes:
We found multi-collinearity between the variables describing the spouse and those describing the head of the household. We believe this is a case of homogamy (marriage between individuals who are similar to each other), which is why we chose to drop the spouse variables.
In [8]:
# Predictors retained for the feature-selection run: the spouse variables
# were dropped (multi-collinearity with the head-of-household variables),
# and TotalFamilyIncome is excluded because it is the target.
varForFeatureSelection = [
'HomeType',
'RoomsNumber',
'FloorMaterial',
'RoofMaterial',
'RoofCoat',
'Water',
'Toilet',
'ToiletLocation',
'ToiletType',
'Sewer',
'UsableTotalRooms',
'SleepingRooms',
'Kitchen',
'Sink',
'Ownership',
'CookingCombustible',
'BathroomUse',
'Working',
'HouseMembers',
'Memberless10',
'Membermore10',
'CookingRec',
'WaterRec',
'OwnershipRec',
'Hacinamiento',
'schoolAndJob',
'noJob',
'job',
'headAge',
'headFemale',
'headEduc',
'headPrimary',
'headSecondary',
'headUniversity',
'headJob',
'sumPredicted']
In [9]:
# !pip install minepy
from sklearn.linear_model import (LinearRegression, Ridge,
Lasso, RandomizedLasso)
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from minepy import MINE
# Target vector and design matrix for the ranking methods below.
Y = data.TotalFamilyIncome
X = np.asarray(data.loc[:,varForFeatureSelection])
names = data.loc[:,varForFeatureSelection].columns
# One entry per ranking method: method name -> {feature name -> score}.
ranks = {}
def rank_to_dict(ranks, names, order=1):
    """Min-max scale raw feature scores to [0, 1] and pair them with names.

    Parameters
    ----------
    ranks : sequence of float
        Raw scores, one per feature.
    names : sequence of str
        Feature names aligned with `ranks`.
    order : int, optional
        Use -1 to invert the ranking (e.g. for RFE, where a lower rank
        means a better feature).

    Returns
    -------
    dict
        Feature name -> score scaled to [0, 1], rounded to 2 decimals.
    """
    # Direct numpy min-max scaling replaces the former MinMaxScaler call
    # plus the np.array([ranks]).T ... .T[0] reshaping gymnastics.
    scores = order * np.asarray(ranks, dtype=float)
    span = scores.max() - scores.min()
    if span == 0:
        # Constant input: scaling is degenerate; report all zeros
        # (matches MinMaxScaler, which treats a zero range as 1).
        scaled = np.zeros_like(scores)
    else:
        scaled = (scores - scores.min()) / span
    return dict(zip(names, (round(x, 2) for x in scaled)))
# Rank the features with several complementary methods; each method's
# scores are min-max scaled to [0, 1] by rank_to_dict so they can be
# averaged and compared.
lr = LinearRegression(normalize=True)
lr.fit(X, Y)
ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

ridge = Ridge(alpha=7)
ridge.fit(X, Y)
ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

lasso = Lasso(alpha=.05)
lasso.fit(X, Y)
ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

# NOTE(review): RandomizedLasso was removed from scikit-learn in 0.21;
# this cell requires an older sklearn (or a stability-selection package).
rlasso = RandomizedLasso(alpha=0.04)
rlasso.fit(X, Y)
ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

# Stop the search when 5 features are left (they will get equal scores).
# Wrap map() in list() so the scores are a concrete sequence on Python 3.
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X, Y)
ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)

rf = RandomForestRegressor()
rf.fit(X, Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

# Univariate F-test against the target.
f, pval = f_regression(X, Y, center=True)
ranks["Corr."] = rank_to_dict(f, names)

# Maximal Information Coefficient: captures non-linear association too.
mine = MINE()
mic_scores = []
for i in range(X.shape[1]):
    mine.compute_score(X[:, i], Y)
    mic_scores.append(mine.mic())
ranks["MIC"] = rank_to_dict(mic_scores, names)

# Average each feature's score across all methods.
r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)
methods = sorted(ranks.keys())
ranks["Mean"] = r
methods.append("Mean")

feat_ranking = pd.DataFrame(ranks)
# .ix is deprecated (removed in pandas >= 1.0); use label-based .loc.
feat_ranking = feat_ranking.loc[:, feat_ranking.columns.tolist()]
feat_ranking.sort_values(['Corr.'], ascending=False).head(15)
Out[9]:
In [10]:
# Top 10 features by univariate correlation, then their correlation matrix.
varComparison = list(feat_ranking.sort_values(['Corr.'], ascending=False).head(10).index)
print('Our first iteration gave us the following table of the top 10 most relevant features: \n')
print(varComparison)
print('\n')
print('And this is the correlation Matrix for those features:')
sns.set(context="paper", font="monospace", font_scale=2)
corrmat = data.loc[:, varComparison].corr()
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corrmat, vmax=.8, square=True)
f.tight_layout()
Conclusion from Feature Selection:
We found the most relevant features for predicting income in the ranking table above.
Accordingly, we chose only the variables that best represented this idea, based on their predictive power, model interpretability, and the possibility of querying them under REDATAM. We also removed features highly correlated with each other to avoid multi-collinearity.
In [11]:
# NOTE(review): execution counts below (In [21], In [16], a second In [11])
# are out of order -- re-run the notebook top to bottom before publishing.
# Model 1A: head-of-household education only.
varForModel = [
'headEduc',
]
runModel(data, 'TotalFamilyIncome', varForModel)
In [12]:
# Model 1B: add the number of employed household members.
varForModel = [
'headEduc',
'job',
]
runModel(data, 'TotalFamilyIncome', varForModel)
In [13]:
# Model 1C: add sleeping rooms as a housing-quality proxy.
varForModel = [
'headEduc',
'job',
'SleepingRooms',]
runModel(data, 'TotalFamilyIncome', varForModel)
In [21]:
# Overcrowding as a boolean flag instead of the raw ratio
# (presumably Model 1D -- confirm against the REDATAM folder MODELO1D).
varForModel = [
'headEduc',
'job',
'haciBool',
#'Hacinamiento'
]
runModel(data, 'TotalFamilyIncome', varForModel)
In [16]:
# Overcrowding as the raw persons-per-room ratio
# (presumably Model 1E -- confirm against the REDATAM folder MODELO1E).
varForModel = [
'headEduc',
'job',
'Hacinamiento'
]
runModel(data, 'TotalFamilyIncome', varForModel)
In [11]:
# Model 2A: combined schooling-and-employment index.
varForModel = [
'schoolAndJob',
]
runModel(data, 'TotalFamilyIncome', varForModel)
In [ ]:
# Model 2B: combined index plus sleeping rooms.
varForModel = [
'SleepingRooms',
'schoolAndJob',
]
runModel(data, 'TotalFamilyIncome', varForModel)
GET SURVEY DATA
In [23]:
# Load individual-level records from the 2010 Annual Household Survey (EAH).
dbf = simpledbf.Dbf5('data/BaseEAH2010/EAH10_BU_IND_VERSION2.dbf')  # PDF press release is available for download
data10 = dbf.to_dataframe()
# Drop records carrying the income sentinel 9999999 and keep only the
# identifier, commune, expansion factor and total family income columns.
keep_cols = ['ID', 'COMUNA', 'FEXP', 'ITFB']
data10 = data10.loc[data10['ITFB'] != 9999999, keep_cols]
data10.head()
Out[23]:
DATA CLEANING
In [24]:
# Explicit reassignments instead of inplace/chained-inplace calls: the
# former data10.ITFB.replace(..., inplace=True) mutates a column of a
# sliced frame and can fail silently (SettingWithCopyWarning).
data10 = data10.drop_duplicates()
# An income of 0 would break downstream comparisons with logged incomes.
data10['ITFB'] = data10['ITFB'].replace(0, 1)
data10['FEXP'] = data10['FEXP'].astype(int)
# Expand the sample: repeat each record FEXP times so population-level
# statistics can be computed with plain unweighted means.
data10exp = data10.loc[np.repeat(data10.index.values, data10.FEXP)]
data10exp.ITFB.groupby(by=data10exp.COMUNA).mean()
Out[24]:
GET PREDICTED DATA FROM REDATAM
The following script transforms the ASCII output from REDATAM into a CSV file that we can work with later. Our objective is to compare the predicted income from each of our models with the real income, commune by commune.
In [25]:
def readRedatamCSV(asciiFile):
    """Parse a REDATAM ASCII export into a two-column DataFrame.

    Scans the file for 'AREA' lines (area code is the first comma-separated
    field of the third whitespace token) and 'Total' lines (measure is the
    third comma-separated field of the third token), stopping at the
    'RESUMEN' summary section.

    Parameters
    ----------
    asciiFile : str
        Path to the REDATAM ASCII output file.

    Returns
    -------
    pandas.DataFrame or None
        Columns 'area' and 'measure' (both strings); None if the parsed
        lists have mismatched lengths (the file name is printed instead).
    """
    areas = []
    measures = []
    # `with` closes the handle deterministically (the original leaked it).
    with open(asciiFile, 'r') as f:
        for line in f:
            columns = line.strip().split()
            if not columns:
                continue
            if 'RESUMEN' in columns[0]:
                # Summary section marks the end of the per-area data.
                break
            if columns[0] == 'AREA':
                areas.append(columns[2].split(',')[0])
            elif columns[0] == 'Total':
                measures.append(columns[2].split(',')[2])
    try:
        return pd.DataFrame({'area': areas, 'measure': measures})
    except Exception:
        # Mismatched area/measure counts: report the offending file and
        # fall through to an implicit None, as before.
        print(asciiFile)
def R2(dataset, real, predicted):
    """Plot prediction diagnostics and print R^2 and the mean error.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding both columns below.
    real : str
        Column with the observed values.
    predicted : str
        Column with the model's predicted values.
    """
    fig = plt.figure(figsize=(24, 6))
    ax1 = fig.add_subplot(1, 3, 1)
    ax2 = fig.add_subplot(1, 3, 2)
    ax3 = fig.add_subplot(1, 3, 3)
    error = dataset[predicted] - dataset[real]
    # Predicted vs real, with the identity line as reference.
    ax1.scatter(dataset[predicted], dataset[real])
    ax1.plot(dataset[real], dataset[real], color='red')
    ax1.set_title('Predicted vs Real')
    # Same comparison on standardized scales.
    ax2.scatter((dataset[predicted] - dataset[predicted].mean()) / dataset[predicted].std(),
                (dataset[real] - dataset[real].mean()) / dataset[real].std())
    ax2.plot((dataset[real] - dataset[real].mean()) / dataset[real].std(),
             (dataset[real] - dataset[real].mean()) / dataset[real].std(), color='red')
    ax2.set_title('Standarized Predicted vs Real')
    # Standardized residuals against the predictions.
    ax3.scatter(dataset[predicted], (error - error.mean()) / error.std())
    ax3.set_title('Standard Error')
    # R^2 = 1 - SS_res / SS_tot. The original printed the raw
    # SS_res / SS_tot ratio (i.e. 1 - R^2) mislabelled as R^2.
    ss_res = ((dataset[real] - dataset[predicted]) ** 2).sum()
    ss_tot = ((dataset[real] - dataset[real].mean()) ** 2).sum()
    print("R^2 is:  {}".format(1 - ss_res / ss_tot))
    print('Mean Error {}'.format(error.mean()))
Model 1A
In [43]:
archivo = 'data/indecOnline/headEduc/comunas.csv' # Model 1A
ingresoXComuna = readRedatamCSV(archivo)
ingresoXComuna.columns = ['area','Predicted_1A']
# NOTE(review): this positional assignment assumes readRedatamCSV returns
# the communes in the same order as the groupby index (ascending COMUNA)
# -- verify before trusting the comparison.
ingresoXComuna['Real_Income'] = list(data10exp.ITFB.groupby(by=data10exp.COMUNA).mean())
ingresoXComuna = ingresoXComuna.loc[:,['area', 'Real_Income', 'Predicted_1A']]
Model 1B
In [44]:
archivo = 'data/indecOnline/headEducYjobs/comuna.csv' # Model 1B
ingresoModelo2 = readRedatamCSV(archivo)
ingresoXComuna = ingresoXComuna.merge(right=ingresoModelo2,on='area')
Model 1C
In [45]:
archivo = 'data/indecOnline/headEducuJobsYrooms/comunas.csv' # Model 1C (comment previously mislabelled it 1A)
ingresoModelo3 = readRedatamCSV(archivo)
ingresoXComuna = ingresoXComuna.merge(right=ingresoModelo3,on='area')
Model 2A
In [46]:
archivo = 'data/indecOnline/jobSchool/comunas.csv' # Model 2A
ingresoModelo4 = readRedatamCSV(archivo)
ingresoXComuna = ingresoXComuna.merge(right=ingresoModelo4,on='area')
Model 2B
In [47]:
archivo = 'data/indecOnline/jobSchoolYrooms/comunas.csv' # Model 2B
ingresoModelo5 = readRedatamCSV(archivo)
ingresoXComuna = ingresoXComuna.merge(right=ingresoModelo5,on='area')
Model 1D
In [48]:
archivo = 'data/indecOnline/MODELO1D/comunas.csv' # Model 1D (comment previously mislabelled it 2B)
ingresoModelo6 = readRedatamCSV(archivo)
ingresoXComuna = ingresoXComuna.merge(right=ingresoModelo6,on='area')
Model 1E
In [49]:
archivo = 'data/indecOnline/MODELO1E/comunas.csv' # Model 1E (comment previously mislabelled it 2B)
ingresoModelo7 = readRedatamCSV(archivo)
ingresoXComuna = ingresoXComuna.merge(right=ingresoModelo7,on='area')
DATA CLEANING
In [50]:
# Name the columns once, then cast every income column (all but 'Comune')
# to float in a single vectorized pass instead of the former positional
# per-column iloc loop.
ingresoXComuna.columns = ['Comune','Real_Income','Predicted_1A','Predicted_1B','Predicted_1C',
                          'Predicted_2A','Predicted_2B','Predicted_1D','Predicted_1E']
value_cols = ingresoXComuna.columns[1:]
ingresoXComuna[value_cols] = ingresoXComuna[value_cols].astype(float)
MODEL EVALUATION RESULTS
In [51]:
# Display the full comparison table (one row per commune).
ingresoXComuna
Out[51]:
In [56]:
# NOTE(review): Predicted_1E is evaluated twice (here and at In [54]
# below), and execution counts are out of order -- re-run top to bottom.
R2 (ingresoXComuna,'Real_Income','Predicted_1E')
In [246]:
R2 (ingresoXComuna,'Real_Income','Predicted_1A')
In [247]:
R2 (ingresoXComuna,'Real_Income','Predicted_1B')
In [248]:
R2 (ingresoXComuna,'Real_Income','Predicted_1C')
In [53]:
R2 (ingresoXComuna,'Real_Income','Predicted_1D')
In [54]:
# Duplicate of the In [56] evaluation above.
R2 (ingresoXComuna,'Real_Income','Predicted_1E')
In [249]:
R2 (ingresoXComuna,'Real_Income','Predicted_2A')
In [250]:
R2 (ingresoXComuna,'Real_Income','Predicted_2B')